Linear regression with gluon



In [1]:

    
from __future__ import print_function
from tqdm import tqdm
import mxnet as mx
from mxnet import gluon



In [2]:

    
# Set context
data_ctx = mx.cpu()
model_ctx = mx.cpu()



In [3]:

    
num_inputs = 2
num_outputs = 1
num_examples = 10000



In [4]:

    
W1_real = 2.0
W2_real = -3.4
b_real = 4.2



In [5]:

    
def real_fn(X):
    return W1_real * X[:, 0] + W2_real * X[:, 1] + b_real



In [6]:

    
X = mx.nd.random_normal(shape=(num_examples, num_inputs))
noise = 0.01 * mx.nd.random_normal(shape=(num_examples,))
y = real_fn(X) + noise



In [7]:

    
# Data iterator
batch_size = 4
train_data = gluon.data.DataLoader(gluon.data.ArrayDataset(X, y),
                                   batch_size=batch_size, 
                                   shuffle=True)



In [8]:

    
# Define the model
net = gluon.nn.Dense(in_units=2, units=1)



In [9]:

    
print(net.weight)
print(net.bias)









    



Parameter dense0_weight (shape=(1, 2), dtype=float32)
Parameter dense0_bias (shape=(1,), dtype=float32)

Collecting parameters



In [10]:

    
# By default, weights and biases are of type Parameter.
print(type(net.weight))
print(type(net.bias))









    



<class 'mxnet.gluon.parameter.Parameter'>
<class 'mxnet.gluon.parameter.Parameter'>



In [11]:

    
# After collecting the parameters, we obtain ParameterDict(ionary)
net.collect_params()









    Out[11]:





dense0_ (
  Parameter dense0_weight (shape=(1, 2), dtype=float32)
  Parameter dense0_bias (shape=(1,), dtype=float32)
)



In [12]:

    
type(net.collect_params())









    Out[12]:





mxnet.gluon.parameter.ParameterDict

Initialize parameters



In [13]:

    
# Initialize with standard normal distribution
net.collect_params().initialize(mx.init.Normal(sigma=1.0), ctx=model_ctx)

Accessing parameters of the network



In [14]:

    
print(net.weight.data())
print(net.bias.data())









    



[[0.93444026 0.5380863 ]]
<NDArray 1x2 @cpu(0)>

[0.]
<NDArray 1 @cpu(0)>

Passing the data to the model



In [15]:

    
example_data = mx.nd.array([[4,7]])



In [16]:

    
# "Prediction"
net(example_data)









    Out[16]:





[[7.504365]]
<NDArray 1x1 @cpu(0)>

Defining the network (again)



In [17]:

    
# It is not crucial to define in_units
# The parameters will be initialized when the data flows through it for the first time.
net = gluon.nn.Dense(units=1)
net.collect_params().initialize(mx.init.Normal(sigma=1.), ctx=model_ctx)

Defining the loss



In [18]:

    
# Defining the loss as squared error
square_loss = gluon.loss.L2Loss()

Defining the optimizer



In [19]:

    
trainer = gluon.Trainer(params=net.collect_params(), 
                        optimizer='sgd', 
                        optimizer_params={'learning_rate': 0.0001})

Training loop



In [20]:

    
epochs = 20
loss_sequence = []
num_batches = num_examples / batch_size

for e in range(epochs):
    cumulative_loss = 0
    # Iterating over the batches
    for i, (data, label) in tqdm(enumerate(train_data)):
        data = data.as_in_context(model_ctx)
        label = label.as_in_context(model_ctx)
        with mx.autograd.record():
            output = net(data)
            loss = square_loss(output, label)
        loss.backward()
        trainer.step(batch_size)
        cumulative_loss += mx.nd.mean(loss).asscalar()
    print("Epoch %s, loss: %s" % (e, cumulative_loss / num_examples))
    loss_sequence.append(cumulative_loss)









    



2500it [00:05, 469.05it/s]






    



Epoch 0, loss: 4.044808251080662






    



2500it [00:04, 516.15it/s]






    



Epoch 1, loss: 2.4210833267018197






    



2500it [00:04, 569.99it/s]






    



Epoch 2, loss: 1.449182435182482






    



2500it [00:04, 539.26it/s]






    



Epoch 3, loss: 0.8674455086305738






    



2500it [00:05, 477.87it/s]






    



Epoch 4, loss: 0.5192347130089998






    



2500it [00:04, 546.39it/s]






    



Epoch 5, loss: 0.31080911717228593






    



2500it [00:04, 543.77it/s]






    



Epoch 6, loss: 0.18604981883727015






    



2500it [00:04, 544.60it/s]






    



Epoch 7, loss: 0.11137253744467161






    



2500it [00:04, 550.24it/s]






    



Epoch 8, loss: 0.06667259070305154






    



2500it [00:04, 555.24it/s]






    



Epoch 9, loss: 0.03991518984034192






    



2500it [00:04, 535.74it/s]






    



Epoch 10, loss: 0.02389857202839339






    



2500it [00:04, 557.72it/s]






    



Epoch 11, loss: 0.014310872454661876






    



2500it [00:04, 562.10it/s]






    



Epoch 12, loss: 0.008571591195501969






    



2500it [00:04, 549.72it/s]






    



Epoch 13, loss: 0.005135956158919725






    



2500it [00:04, 556.33it/s]






    



Epoch 14, loss: 0.0030793819638696733






    



2500it [00:04, 505.95it/s]






    



Epoch 15, loss: 0.0018482797156397282






    



2500it [00:04, 547.82it/s]






    



Epoch 16, loss: 0.00111145593189176






    



2500it [00:04, 579.36it/s]






    



Epoch 17, loss: 0.0006702495257908595






    



2500it [00:04, 533.51it/s]






    



Epoch 18, loss: 0.00040625386701976824






    



2500it [00:04, 560.46it/s]






    



Epoch 19, loss: 0.00024821289024657745

Getting the parameters



In [21]:

    
# ParameterDict
params = net.collect_params()



In [22]:

    
for param in params.values():
    print(param.name, param.data())









    



dense1_weight 
[[ 1.9863372 -3.3749702]]
<NDArray 1x2 @cpu(0)>
dense1_bias 
[4.175586]
<NDArray 1 @cpu(0)>



In [23]:

    
list(params.values())[0].data()









    Out[23]:





[[ 1.9863372 -3.3749702]]
<NDArray 1x2 @cpu(0)>



In [24]:

    
[W1, W2] = list(params.values())[0].data()[0]



In [25]:

    
b = list(params.values())[1].data()[0]



In [26]:

    
# Learned values
print(W1_real)
print(W2_real)
print(b_real)



In [27]:

    
# True values
print(W1.asscalar())
print(W2.asscalar())
print(b.asscalar())